This notebook is analyzing the pixy output for the Ceratopipra rubrocapilla dataset.

First, load some functions for loading the data. Depending on the downstream analysis, I want the data either in “long” format (generally recommended) or “wide” format.

#convert pixy data to long format
#This function came from the pixy documentation
pixy_to_long <- function(pixy_files){
  pixy_df <- list()
  for(i in 1:length(pixy_files)){
    stat_file_type <- gsub(".*_|.txt", "", pixy_files[i])
    if(stat_file_type == "pi"){
      df <- read_delim(pixy_files[i], delim = "\t")
      df <- df %>%
        gather(-pop, -window_pos_1, -window_pos_2, -chromosome,
               key = "statistic", value = "value") %>%
        rename(pop1 = pop) %>%
        mutate(pop2 = NA)
      pixy_df[[i]] <- df
    } else{
      df <- read_delim(pixy_files[i], delim = "\t")
      df <- df %>%
        gather(-pop1, -pop2, -window_pos_1, -window_pos_2, -chromosome,
               key = "statistic", value = "value")
      pixy_df[[i]] <- df
    }
  }
  bind_rows(pixy_df) %>%
    arrange(pop1, pop2, chromosome, window_pos_1, statistic)
}


#convert pixy data to wide format
pixy_to_wide <- function(pixy_files){
  pixy_df <- list()
  for(i in 1:length(pixy_files)){
    stat_file_type <- gsub(".*_|.txt", "", pixy_files[i])
    if(stat_file_type == "pi"){
      df <- read_delim(pixy_files[i], delim = "\t")
            pixy_df[[i]] <- df %>%
            select(pop, chromosome, window_pos_1, window_pos_2, no_sites, avg_pi) %>%
            spread(key=pop, value=avg_pi)
    } else if(stat_file_type == "dxy"){
      df <- read_delim(pixy_files[i], delim = "\t")
            pixy_df[[i]] <- df %>%
            unite("pop", pop1:pop2, remove = TRUE) %>%
            select(pop, chromosome, window_pos_1, window_pos_2, no_sites, avg_dxy) %>%
            spread(key=pop, value=avg_dxy)
    } else{
      df <- read_delim(pixy_files[i], delim = "\t")
            pixy_df[[i]] <- df %>%
            unite("pop", pop1:pop2, sep = "_Fst_", remove = TRUE)%>%
            select(pop, chromosome, window_pos_1, window_pos_2, no_snps, avg_wc_fst) %>%
            spread(key=pop, value=avg_wc_fst)
    }
  }
  #merge all the dataframes according to position
  pixy_df %>% reduce(left_join, by = c("chromosome", "window_pos_1", "window_pos_2"))
}

Now, load the data!

#I have a couple different datasets in different folders; pick which one to analyze
#fairfilters are the filters that I consider to be a good balance between resolution and noise (window size 500 kb) 

#there are two datasets are for the publication. "headwaterslabelled" divides the dataset into the main areas of endemism (Inambari, Rondonia, Tapajos, Xingu, and the Atlantic Forest. Headwater samples are separated to separate populations for Rondonia, Tapajos, and Xingu.) The "subpops" dataset further separates Inambari into East and West, and Xingu into north and central, and the Atlantic Forest into its two sampling locations.
folder <- "window_500k_37_0.2_5_20.0_headwaterslabelled" #This sample has Xingu, Tapajos, and Rondonia headwater samples separated from their respective populations, but Central/North Xingu are combined, as are East and West Inambari and the two Atlantic Forest populations.
folder <- "window_500k_37_0.2_5_20.0_subpops" #This sample has Xingu, Tapajos, and Rondonia headwater samples separated from their respective populations, and Central & North Xingu are separated, as are East and West Inambari and the two Atlantic Forest populations.


#get a list of chromosomes to read. I have each chromosome in a separate file. This is just the prefix of the pixy files. If you have a single prefix, just define "chroms" as whatever that prefix is.
#files<-c("chr_1", "chr_1A", "chr_2", "chr_3", "chr_4", "chr_4A", "chr_5", "chr_6", "chr_7", "chr_8", "chr_9", "chr_10", "chr_11", "chr_12", "chr_13", "chr_14", "chr_15", "chr_17", "chr_18", "chr_19", "chr_20", "chr_21", "chr_22", "chr_23", "chr_24", "chr_25", "chr_26", "chr_27", "chr_28", "chr_29") #All autosomes
files<-c("chr_1", "chr_1A", "chr_2", "chr_3", "chr_4", "chr_4A", "chr_5", "chr_6", "chr_7", "chr_8", "chr_9", "chr_10", "chr_11", "chr_12", "chr_13", "chr_14", "chr_15", "chr_17", "chr_18", "chr_19", "chr_20", "chr_21", "chr_22", "chr_23", "chr_24", "chr_25", "chr_26", "chr_27", "chr_28", "chr_29", "chr_Znopseudo_2het_trimmedP0") #All chromosomes including chrZ

#there are a couple comparisons available to test bioinformatic filters on the Z chromosome, used to test robustness of results (results are qualitatively very similar, but removing the pseudoautosomal region does decrease estimates of diversity quite a bit - not surprising and likely a combination of a real biological difference across the chromosome and increased error rate due to mismapped reads)
#files<-c("chr_Z") #chrZ without filters to remove pseudoautosomal/chrW region
#files<-c("chr_Znopseudo") #chrZ with filters to remove pseudoautosomal region, but not other areas where W likely aligns

#loop through the chosen files and load the data

#setup empty df to hold data
pixy_df <- NULL
#loop through chromosomes, reading their data and adding it to the dataframe
for(file in files){
  print(c("loading ", file))
  chrom_df <- pixy_to_long(c(paste("Ceratopipra_rubrocapilla/", folder, "/", file, "_dxy.txt", sep=""), paste("Ceratopipra_rubrocapilla/", folder, "/", file, "_pi.txt", sep=""), paste("Ceratopipra_rubrocapilla/", folder, "/", file, "_fst.txt", sep="")))
  pixy_df <- rbind(pixy_df, chrom_df)
}


#we can load an alternative dataset for Fst, with a min minor allele frequency filter of 0.05. Alleles with frequency below 0.05 may be less informative.
#choose the Fst dataset that corresponds to the dxy/pi dataset above
fstfolder <- "window_500k_37_0.2_5_20.0_qualmaf5_headwaterslabelled"
fstfolder <- "window_500k_37_0.2_5_20.0_qualmaf5_subpops"

#setup empty df to hold data
pixy_fstdf <- NULL
#loop through chromosomes, reading their data and adding it to the dataframe
for(file in files){
  print(c("loading ", file))
  chrom_df <- pixy_to_long(c(paste("Ceratopipra_rubrocapilla/", fstfolder, "/", file, "_fst.txt", sep="")))
  pixy_fstdf <- rbind(pixy_fstdf, chrom_df)
}


#this dataset was to look for areas around the pseudoautosomal region where W sequences may map to the Z
fstfolder <- "chrZ_investigation/Ceratopipra_rubrocapilla/window_1bp_chrZ"
files<-c("chr_Z") #chr_Z was investigated for differentiation between males and females

#setup empty df to hold data
pixy_chrZ_investigation_fstdf <- NULL
#loop through chromosomes, reading their data and adding it to the dataframe
for(file in files){
  print(c("loading ", file))
  chrom_df <- pixy_to_long(c(paste("Ceratopipra_rubrocapilla/", fstfolder, "/", file, "_fst.txt", sep="")))
  pixy_chrZ_investigation_fstdf <- rbind(pixy_chrZ_investigation_fstdf, chrom_df)
}
#I needed to do this when my population was just named "F", which R interprets as "false" 
pixy_chrZ_investigation_fstdf$pop1 <- "Male"
pixy_chrZ_investigation_fstdf$pop2 <- "Female"


# create a custom labeller for special characters in pi/dxy/fst, so that the labels look nicer in the plot
pixy_labeller <- as_labeller(c(avg_pi = "pi", avg_dxy = "D[XY]", avg_wc_fst = "F[ST]"), default = label_parsed)

I also want a long format dataframe so I can calculate some statistics more easily.

#setup empty df to hold data
pixy_df2 <- NULL
#loop through chromosomes, reading their data and adding it to the dataframe
for(file in files){
  print(c("loading ", file))
  chrom_df <- pixy_to_wide(c(paste("Ceratopipra_rubrocapilla/", folder, "/", file, "_dxy.txt", sep=""), paste("Ceratopipra_rubrocapilla/", folder, "/", file, "_pi.txt", sep=""), paste("Ceratopipra_rubrocapilla/", folder, "/", file, "_fst.txt", sep="")))
  pixy_df2 <- rbind(pixy_df2, chrom_df)
}

Calculate means

Let’s quickly observe the means for each statistic. To properly get the mean of pi and dxy, we cannot simply average across all windows since each window could have variable amounts of missing data - rather, we will sum all the differences between all individuals across all sites, and divide that by the total number of sites that were compared across all sites and all individuals.

#simple but less correct way
#means <- pixy_df %>% group_by(pop1, pop2, statistic) %>% summarise(mean=mean(value, na.rm=T))
#means

#more correct way

#firstly, can decide to filter to look at only some chromosomes (eg, exclude sex chromosomes). I am selecting only the autosomes. I will look at chrZ separately as it has a different effective population size and is expected to experience different levels of diversity, divergence, and differentiation.
chromosomes<-c("chr_1", "chr_1A", "chr_2", "chr_3", "chr_4", "chr_4A", "chr_5", "chr_6", "chr_7", "chr_8", "chr_9", "chr_10", "chr_11", "chr_12", "chr_13", "chr_14", "chr_15", "chr_17", "chr_18", "chr_19", "chr_20", "chr_21", "chr_22", "chr_23", "chr_24", "chr_25", "chr_26", "chr_27", "chr_28", "chr_29")

#make a list of populations to analyze

#could look at subpopulations. Note that the order here will affect what order they are reported in.
populations <- c("AtlanticForest", "AtlanticForest_Ibateguara", "AtlanticForest_Ilheus", "InambariE", "InambariW", "Inambari", "Rondonia", "RondoniaHeadwaters", "Tapajos", "TapajosHeadwaters", "XinguC", "XinguHeadwaters", "XinguN", "XinguBelem")

#alternatively, could look at just the main populations
#populations <- c("Inambari", "Rondonia", "Tapajos", "XinguBelem", "AtlanticForest")

#calculate genome-wide pi for each population selected above
print("calculating genome-wide pi per species")
## [1] "calculating genome-wide pi per species"
for(species1 in populations){
  temp<- pixy_df %>% dplyr::filter(pop1==species1) %>% dplyr::filter(is.na(pop2)) %>% dplyr::filter(chromosome %in% chromosomes)
  differences<- sum(temp[temp$statistic=="count_diffs",]$value, na.rm=T)
  comparisons<- sum(temp[temp$statistic=="count_comparisons",]$value, na.rm=T)
  pi<- differences/comparisons
  if(!is.na(pi)){print(c(species1, pi))}
}
## [1] "AtlanticForest_Ibateguara" "0.00503835003117907"      
## [1] "AtlanticForest_Ilheus" "0.00493296640492069"  
## [1] "InambariE"           "0.00557554511276617"
## [1] "InambariW"           "0.00566896652201885"
## [1] "Rondonia"            "0.00562052514679235"
## [1] "RondoniaHeadwaters"  "0.00611820259917407"
## [1] "Tapajos"             "0.00565452806266693"
## [1] "TapajosHeadwaters"  "0.0053105054142538"
## [1] "XinguC"              "0.00522491396820343"
## [1] "XinguHeadwaters"     "0.00526687057716538"
## [1] "XinguN"              "0.00565812572173657"
#calculate chr_Z pi
print("calculating Z chromosome pi per species")
## [1] "calculating Z chromosome pi per species"
for(species1 in populations){
  temp<- pixy_df %>% dplyr::filter(pop1==species1) %>% dplyr::filter(is.na(pop2)) %>% dplyr::filter(chromosome %in% "chr_Z")
  differences<- sum(temp[temp$statistic=="count_diffs",]$value, na.rm=T)
  comparisons<- sum(temp[temp$statistic=="count_comparisons",]$value, na.rm=T)
  pi<- differences/comparisons
  if(!is.na(pi)){print(c(species1, pi))}
}
## [1] "AtlanticForest_Ibateguara" "0.00198188972662968"      
## [1] "AtlanticForest_Ilheus" "0.00204469940951736"  
## [1] "InambariE"           "0.00251284498975208"
## [1] "InambariW"           "0.00295681006578369"
## [1] "Rondonia"            "0.00278878261691745"
## [1] "RondoniaHeadwaters"  "0.00184158388491328"
## [1] "Tapajos"             "0.00268382905082644"
## [1] "TapajosHeadwaters"   "0.00273212418856823"
## [1] "XinguC"              "0.00193308810825293"
## [1] "XinguHeadwaters"   "0.002075674881349"
## [1] "XinguN"              "0.00243421667502145"
#this is an alternative, quicker and less correct way to get an estimate of genome wide pi, I would like to compare
#calculate genome-wide pi
print("calculating genome-wide pi per species, averaged across windows")
## [1] "calculating genome-wide pi per species, averaged across windows"
pixy_df %>% dplyr::filter(pop1==species1) %>% dplyr::filter(!is.na(value)) %>% dplyr::filter(chromosome %in% chromosomes)  %>% dplyr::filter(statistic=="avg_pi") %>% group_by(pop1) %>% summarise(pi=mean(value), nopi= sum(value ==0)/n(), lowpi= sum(value < 0.001)/n(), medlowpi= sum(value < 0.002)/n())
## # A tibble: 0 × 5
## # ℹ 5 variables: pop1 <chr>, pi <dbl>, nopi <dbl>, lowpi <dbl>, medlowpi <dbl>
#calculate genome-wide dxy
print("calculating genome-wide dxy per species")
## [1] "calculating genome-wide dxy per species"
for(species1 in populations){
  for(species2 in populations){
    temp<- pixy_df %>% dplyr::filter(pop1==species1) %>% dplyr::filter(pop2==species2) %>% dplyr::filter(chromosome %in% chromosomes)
    differences<- sum(temp[temp$statistic=="count_diffs",]$value, na.rm=T)
    comparisons<- sum(temp[temp$statistic=="count_comparisons",]$value, na.rm=T)
    dxy<- differences/comparisons
    if(!is.na(dxy)){
      print(c(species1, species2, dxy))
    }
  }
}
## [1] "AtlanticForest_Ibateguara" "AtlanticForest_Ilheus"    
## [3] "0.00530778294004487"      
## [1] "AtlanticForest_Ibateguara" "InambariE"                
## [3] "0.00583109215836177"      
## [1] "AtlanticForest_Ibateguara" "InambariW"                
## [3] "0.00589548291664156"      
## [1] "AtlanticForest_Ibateguara" "Rondonia"                 
## [3] "0.00578485520333174"      
## [1] "AtlanticForest_Ibateguara" "RondoniaHeadwaters"       
## [3] "0.00593167137680746"      
## [1] "AtlanticForest_Ibateguara" "Tapajos"                  
## [3] "0.00571182551604985"      
## [1] "AtlanticForest_Ibateguara" "TapajosHeadwaters"        
## [3] "0.00579285776032775"      
## [1] "AtlanticForest_Ibateguara" "XinguC"                   
## [3] "0.00550761340513998"      
## [1] "AtlanticForest_Ibateguara" "XinguHeadwaters"          
## [3] "0.00580728848895484"      
## [1] "AtlanticForest_Ibateguara" "XinguN"                   
## [3] "0.00564192173772957"      
## [1] "AtlanticForest_Ilheus" "InambariE"             "0.00579858817516961"  
## [1] "AtlanticForest_Ilheus" "InambariW"             "0.00587403302784634"  
## [1] "AtlanticForest_Ilheus" "Rondonia"              "0.00575046070368699"  
## [1] "AtlanticForest_Ilheus" "RondoniaHeadwaters"    "0.00591701184622928"  
## [1] "AtlanticForest_Ilheus" "Tapajos"               "0.00569653384146549"  
## [1] "AtlanticForest_Ilheus" "TapajosHeadwaters"     "0.00576541396679203"  
## [1] "AtlanticForest_Ilheus" "XinguC"                "0.00550914251947081"  
## [1] "AtlanticForest_Ilheus" "XinguHeadwaters"       "0.00577398328769916"  
## [1] "AtlanticForest_Ilheus" "XinguN"                "0.00562635259232494"  
## [1] "InambariE"           "Rondonia"            "0.00568858494555003"
## [1] "InambariE"           "RondoniaHeadwaters"  "0.00589168858529891"
## [1] "InambariE"           "Tapajos"             "0.00576056031701176"
## [1] "InambariE"           "TapajosHeadwaters"   "0.00571948556378042"
## [1] "InambariE"           "XinguC"              "0.00580444608529642"
## [1] "InambariE"           "XinguHeadwaters"     "0.00572973130526026"
## [1] "InambariE"          "XinguN"             "0.0058043683385902"
## [1] "InambariW"           "InambariE"           "0.00571528237889819"
## [1] "InambariW"           "Rondonia"            "0.00574812131270173"
## [1] "InambariW"           "RondoniaHeadwaters"  "0.00597167876608276"
## [1] "InambariW"           "Tapajos"             "0.00582146282106594"
## [1] "InambariW"           "TapajosHeadwaters"   "0.00576702396034367"
## [1] "InambariW"           "XinguC"              "0.00586684158651894"
## [1] "InambariW"           "XinguHeadwaters"     "0.00577843210907123"
## [1] "InambariW"           "XinguN"              "0.00585542207339537"
## [1] "Rondonia"            "RondoniaHeadwaters"  "0.00587357783807833"
## [1] "Rondonia"            "Tapajos"             "0.00570201724587135"
## [1] "Rondonia"            "TapajosHeadwaters"   "0.00566925236323599"
## [1] "Rondonia"            "XinguC"              "0.00574551421739934"
## [1] "Rondonia"            "XinguHeadwaters"     "0.00567959469930083"
## [1] "Rondonia"            "XinguN"              "0.00573592496217393"
## [1] "RondoniaHeadwaters"  "Tapajos"             "0.00590585193367116"
## [1] "RondoniaHeadwaters"  "TapajosHeadwaters"   "0.00579377724347487"
## [1] "RondoniaHeadwaters"  "XinguC"              "0.00592594681077587"
## [1] "RondoniaHeadwaters"  "XinguHeadwaters"     "0.00580808772573003"
## [1] "RondoniaHeadwaters"  "XinguN"              "0.00593768394474742"
## [1] "Tapajos"             "TapajosHeadwaters"   "0.00569133878891467"
## [1] "Tapajos"             "XinguC"              "0.00567772762913389"
## [1] "Tapajos"             "XinguHeadwaters"     "0.00571097518619998"
## [1] "Tapajos"             "XinguN"              "0.00569098708242333"
## [1] "TapajosHeadwaters"   "XinguC"              "0.00576496119172132"
## [1] "TapajosHeadwaters"   "XinguHeadwaters"     "0.00545645343339129"
## [1] "TapajosHeadwaters"   "XinguN"              "0.00572857606699595"
## [1] "XinguC"              "XinguHeadwaters"     "0.00573582316781036"
## [1] "XinguC"              "XinguN"              "0.00561537482898087"
## [1] "XinguHeadwaters"     "XinguN"              "0.00576610574106164"
#repeat for chrZ
for(species1 in populations){
  for(species2 in populations){
    temp<- pixy_df %>% dplyr::filter(pop1==species1) %>% dplyr::filter(pop2==species2) %>% dplyr::filter(chromosome %in% "chr_Z")
    differences<- sum(temp[temp$statistic=="count_diffs",]$value, na.rm=T)
    comparisons<- sum(temp[temp$statistic=="count_comparisons",]$value, na.rm=T)
    dxy<- differences/comparisons
    if(!is.na(dxy)){
      print(c(species1, species2, dxy))
    }
  }
}
## [1] "AtlanticForest_Ibateguara" "AtlanticForest_Ilheus"    
## [3] "0.00225052572258383"      
## [1] "AtlanticForest_Ibateguara" "InambariE"                
## [3] "0.00292354096210425"      
## [1] "AtlanticForest_Ibateguara" "InambariW"                
## [3] "0.00305376530365379"      
## [1] "AtlanticForest_Ibateguara" "Rondonia"                 
## [3] "0.00290429546620253"      
## [1] "AtlanticForest_Ibateguara" "RondoniaHeadwaters"       
## [3] "0.00271711408644503"      
## [1] "AtlanticForest_Ibateguara" "Tapajos"                  
## [3] "0.00272305062231909"      
## [1] "AtlanticForest_Ibateguara" "TapajosHeadwaters"        
## [3] "0.00284353888877947"      
## [1] "AtlanticForest_Ibateguara" "XinguC"                   
## [3] "0.00242945930747546"      
## [1] "AtlanticForest_Ibateguara" "XinguHeadwaters"          
## [3] "0.00275244572053365"      
## [1] "AtlanticForest_Ibateguara" "XinguN"                   
## [3] "0.00267480011732309"      
## [1] "AtlanticForest_Ilheus" "InambariE"             "0.00297261959698508"  
## [1] "AtlanticForest_Ilheus" "InambariW"             "0.00314351509058426"  
## [1] "AtlanticForest_Ilheus" "Rondonia"              "0.00298134224141317"  
## [1] "AtlanticForest_Ilheus" "RondoniaHeadwaters"    "0.00285030562631942"  
## [1] "AtlanticForest_Ilheus" "Tapajos"               "0.0028176512659944"   
## [1] "AtlanticForest_Ilheus" "TapajosHeadwaters"     "0.00297315684855059"  
## [1] "AtlanticForest_Ilheus" "XinguC"                "0.0024780024526798"   
## [1] "AtlanticForest_Ilheus" "XinguHeadwaters"       "0.00283051112485661"  
## [1] "AtlanticForest_Ilheus" "XinguN"                "0.00272094188611383"  
## [1] "InambariE"           "Rondonia"            "0.00293471604564043"
## [1] "InambariE"           "RondoniaHeadwaters"  "0.00279278168989145"
## [1] "InambariE"           "Tapajos"             "0.00295930514744464"
## [1] "InambariE"           "TapajosHeadwaters"   "0.00300387854967193"
## [1] "InambariE"           "XinguC"              "0.00292444747462284"
## [1] "InambariE"           "XinguHeadwaters"     "0.00282655424471302"
## [1] "InambariE"           "XinguN"              "0.00303562458449719"
## [1] "InambariW"           "InambariE"           "0.00291597571186651"
## [1] "InambariW"           "Rondonia"            "0.00304237789149533"
## [1] "InambariW"           "RondoniaHeadwaters"  "0.00294737863972948"
## [1] "InambariW"           "Tapajos"             "0.00308311768802784"
## [1] "InambariW"           "TapajosHeadwaters"   "0.00314131335061145"
## [1] "InambariW"           "XinguC"              "0.00304966478915546"
## [1] "InambariW"           "XinguHeadwaters"     "0.00297794933347265"
## [1] "InambariW"           "XinguN"              "0.00311521839754724"
## [1] "Rondonia"            "RondoniaHeadwaters"  "0.00274374251146055"
## [1] "Rondonia"            "Tapajos"             "0.00290231783530365"
## [1] "Rondonia"            "TapajosHeadwaters"   "0.00293382731941904"
## [1] "Rondonia"            "XinguC"              "0.00289233442831631"
## [1] "Rondonia"            "XinguHeadwaters"     "0.00275708136323727"
## [1] "Rondonia"            "XinguN"              "0.00299206670530784"
## [1] "RondoniaHeadwaters"  "Tapajos"             "0.00279620111650543"
## [1] "RondoniaHeadwaters" "TapajosHeadwaters"  "0.0027665736540725"
## [1] "RondoniaHeadwaters"  "XinguC"              "0.00277435429928004"
## [1] "RondoniaHeadwaters"  "XinguHeadwaters"     "0.00256813192860593"
## [1] "RondoniaHeadwaters"  "XinguN"              "0.00285333107123302"
## [1] "Tapajos"             "TapajosHeadwaters"   "0.00292168326373273"
## [1] "Tapajos"             "XinguC"              "0.00272752606618029"
## [1] "Tapajos"            "XinguHeadwaters"    "0.0027538824817445"
## [1] "Tapajos"             "XinguN"              "0.00281251257582133"
## [1] "TapajosHeadwaters"   "XinguC"              "0.00291598649047621"
## [1] "TapajosHeadwaters"   "XinguHeadwaters"     "0.00256182249322493"
## [1] "TapajosHeadwaters"   "XinguN"              "0.00301228063857263"
## [1] "XinguC"             "XinguHeadwaters"    "0.0027683349474423"
## [1] "XinguC"              "XinguN"              "0.00262328418912049"
## [1] "XinguHeadwaters"     "XinguN"              "0.00283291916386493"
#compare the autosomal results to what it would have been to just take average dxy accross windows (autosomes). Similar?
for(species1 in populations){
  for(species2 in populations){
    temp<- pixy_df %>% dplyr::filter(pop1==species1) %>% dplyr::filter(pop2==species2) %>% dplyr::filter(chromosome %in% chromosomes) %>% dplyr::filter(statistic=="avg_dxy")
    dxy<- mean(temp$value, na.rm=T)
    if(!is.na(dxy)){
      print(c(species1, species2, dxy, "do not use this"))
    }
  }
}
## [1] "AtlanticForest_Ibateguara" "AtlanticForest_Ilheus"    
## [3] "0.00505184650393601"       "do not use this"          
## [1] "AtlanticForest_Ibateguara" "InambariE"                
## [3] "0.00561298481733318"       "do not use this"          
## [1] "AtlanticForest_Ibateguara" "InambariW"                
## [3] "0.00565909702355201"       "do not use this"          
## [1] "AtlanticForest_Ibateguara" "Rondonia"                 
## [3] "0.00554275203379782"       "do not use this"          
## [1] "AtlanticForest_Ibateguara" "RondoniaHeadwaters"       
## [3] "0.00567114906367969"       "do not use this"          
## [1] "AtlanticForest_Ibateguara" "Tapajos"                  
## [3] "0.00547061709820232"       "do not use this"          
## [1] "AtlanticForest_Ibateguara" "TapajosHeadwaters"        
## [3] "0.00552787137563704"       "do not use this"          
## [1] "AtlanticForest_Ibateguara" "XinguC"                   
## [3] "0.00524338966881735"       "do not use this"          
## [1] "AtlanticForest_Ibateguara" "XinguHeadwaters"          
## [3] "0.00553587814823246"       "do not use this"          
## [1] "AtlanticForest_Ibateguara" "XinguN"                   
## [3] "0.00537252383278201"       "do not use this"          
## [1] "AtlanticForest_Ilheus" "InambariE"             "0.00556038524559038"  
## [4] "do not use this"      
## [1] "AtlanticForest_Ilheus" "InambariW"             "0.00561350226253956"  
## [4] "do not use this"      
## [1] "AtlanticForest_Ilheus" "Rondonia"              "0.00549358340324968"  
## [4] "do not use this"      
## [1] "AtlanticForest_Ilheus" "RondoniaHeadwaters"    "0.00564080578930097"  
## [4] "do not use this"      
## [1] "AtlanticForest_Ilheus" "Tapajos"               "0.00544074631375506"  
## [4] "do not use this"      
## [1] "AtlanticForest_Ilheus" "TapajosHeadwaters"     "0.00548115399020422"  
## [4] "do not use this"      
## [1] "AtlanticForest_Ilheus" "XinguC"                "0.00524068896972755"  
## [4] "do not use this"      
## [1] "AtlanticForest_Ilheus" "XinguHeadwaters"       "0.00549872401830841"  
## [4] "do not use this"      
## [1] "AtlanticForest_Ilheus" "XinguN"                "0.00536239219081242"  
## [4] "do not use this"      
## [1] "InambariE"           "Rondonia"            "0.00545671434796454"
## [4] "do not use this"    
## [1] "InambariE"           "RondoniaHeadwaters"  "0.00565824379576142"
## [4] "do not use this"    
## [1] "InambariE"           "Tapajos"             "0.00553829214847797"
## [4] "do not use this"    
## [1] "InambariE"           "TapajosHeadwaters"   "0.00547777374177568"
## [4] "do not use this"    
## [1] "InambariE"           "XinguC"              "0.00556798006397839"
## [4] "do not use this"    
## [1] "InambariE"          "XinguHeadwaters"    "0.0054701293765781"
## [4] "do not use this"   
## [1] "InambariE"           "XinguN"              "0.00555024359241639"
## [4] "do not use this"    
## [1] "InambariW"           "InambariE"           "0.00548637683213361"
## [4] "do not use this"    
## [1] "InambariW"           "Rondonia"            "0.00549798331890128"
## [4] "do not use this"    
## [1] "InambariW"           "RondoniaHeadwaters"  "0.00572418761416959"
## [4] "do not use this"    
## [1] "InambariW"           "Tapajos"             "0.00557913925778164"
## [4] "do not use this"    
## [1] "InambariW"           "TapajosHeadwaters"   "0.00551163565152011"
## [4] "do not use this"    
## [1] "InambariW"           "XinguC"              "0.00562773718567372"
## [4] "do not use this"    
## [1] "InambariW"           "XinguHeadwaters"     "0.00550023967125916"
## [4] "do not use this"    
## [1] "InambariW"           "XinguN"              "0.00559869332700053"
## [4] "do not use this"    
## [1] "Rondonia"            "RondoniaHeadwaters"  "0.00561970375233636"
## [4] "do not use this"    
## [1] "Rondonia"            "Tapajos"             "0.00545630267415149"
## [4] "do not use this"    
## [1] "Rondonia"           "TapajosHeadwaters"  "0.0054035626957941"
## [4] "do not use this"   
## [1] "Rondonia"            "XinguC"              "0.00549720206280043"
## [4] "do not use this"    
## [1] "Rondonia"            "XinguHeadwaters"     "0.00539679119809566"
## [4] "do not use this"    
## [1] "Rondonia"            "XinguN"              "0.00546756850108052"
## [4] "do not use this"    
## [1] "RondoniaHeadwaters"  "Tapajos"             "0.00565828285000259"
## [4] "do not use this"    
## [1] "RondoniaHeadwaters"  "TapajosHeadwaters"   "0.00554152070784149"
## [4] "do not use this"    
## [1] "RondoniaHeadwaters"  "XinguC"              "0.00565374511232059"
## [4] "do not use this"    
## [1] "RondoniaHeadwaters"  "XinguHeadwaters"     "0.00552323531879886"
## [4] "do not use this"    
## [1] "RondoniaHeadwaters"  "XinguN"              "0.00565642440172314"
## [4] "do not use this"    
## [1] "Tapajos"             "TapajosHeadwaters"   "0.00543732299081528"
## [4] "do not use this"    
## [1] "Tapajos"             "XinguC"              "0.00542779229465016"
## [4] "do not use this"    
## [1] "Tapajos"             "XinguHeadwaters"     "0.00544188359745343"
## [4] "do not use this"    
## [1] "Tapajos"            "XinguN"             "0.0054305254982487"
## [4] "do not use this"   
## [1] "TapajosHeadwaters"   "XinguC"              "0.00549551128540814"
## [4] "do not use this"    
## [1] "TapajosHeadwaters"   "XinguHeadwaters"     "0.00517877912549543"
## [4] "do not use this"    
## [1] "TapajosHeadwaters" "XinguN"            "0.005451242689003"
## [4] "do not use this"  
## [1] "XinguC"              "XinguHeadwaters"     "0.00545908223530627"
## [4] "do not use this"    
## [1] "XinguC"              "XinguN"              "0.00534760836810585"
## [4] "do not use this"    
## [1] "XinguHeadwaters"     "XinguN"              "0.00547366158086268"
## [4] "do not use this"
#calculate genome-wide average fst from data not filtered for MAF
print("calculating genome-wide fst per species")
## [1] "calculating genome-wide fst per species"
for(species1 in populations){
  for(species2 in populations){
    temp<- pixy_df %>% dplyr::filter(pop1==species1) %>% dplyr::filter(pop2==species2) %>% dplyr::filter(chromosome %in% chromosomes) %>% dplyr::filter(statistic=="avg_wc_fst")
    fst<- mean(temp$value, na.rm=T)
    if(!is.na(fst)){
      print(c(species1, species2, fst))
    }
  }
}
## [1] "AtlanticForest_Ibateguara" "AtlanticForest_Ilheus"    
## [3] "0.0559237351476193"       
## [1] "AtlanticForest_Ibateguara" "InambariE"                
## [3] "0.08379452916041"         
## [1] "AtlanticForest_Ibateguara" "InambariW"                
## [3] "0.0806308190535946"       
## [1] "AtlanticForest_Ibateguara" "Rondonia"                 
## [3] "0.0692839271787342"       
## [1] "AtlanticForest_Ibateguara" "RondoniaHeadwaters"       
## [3] "0.0761821203886025"       
## [1] "AtlanticForest_Ibateguara" "Tapajos"                  
## [3] "0.0540450166656078"       
## [1] "AtlanticForest_Ibateguara" "TapajosHeadwaters"        
## [3] "0.0972667730838019"       
## [1] "AtlanticForest_Ibateguara" "XinguC"                   
## [3] "0.0561003709750281"       
## [1] "AtlanticForest_Ibateguara" "XinguHeadwaters"          
## [3] "0.102157432474066"        
## [1] "AtlanticForest_Ibateguara" "XinguN"                   
## [3] "0.0545138281308276"       
## [1] "AtlanticForest_Ilheus" "InambariE"             "0.087964865749365"    
## [1] "AtlanticForest_Ilheus" "InambariW"             "0.0838545750310179"   
## [1] "AtlanticForest_Ilheus" "Rondonia"              "0.0708251313327606"   
## [1] "AtlanticForest_Ilheus" "RondoniaHeadwaters"    "0.0866049526112457"   
## [1] "AtlanticForest_Ilheus" "Tapajos"               "0.0585947960389383"   
## [1] "AtlanticForest_Ilheus" "TapajosHeadwaters"     "0.10919357723805"     
## [1] "AtlanticForest_Ilheus" "XinguC"                "0.0675760850771892"   
## [1] "AtlanticForest_Ilheus" "XinguHeadwaters"       "0.11239698548572"     
## [1] "AtlanticForest_Ilheus" "XinguN"                "0.0667725479087515"   
## [1] "InambariE"           "Rondonia"            "0.00986450209549269"
## [1] "InambariE"           "RondoniaHeadwaters"  "0.00277742120545996"
## [1] "InambariE"          "Tapajos"            "0.0208347593481551"
## [1] "InambariE"         "TapajosHeadwaters" "0.014530767030149"
## [1] "InambariE"          "XinguC"             "0.0583993739144533"
## [1] "InambariE"          "XinguHeadwaters"    "0.0242866667681445"
## [1] "InambariE"         "XinguN"            "0.018034663444582"
## [1] "InambariW"           "InambariE"           "0.00985837207094967"
## [1] "InambariW"          "Rondonia"           "0.0161262322727293"
## [1] "InambariW"          "RondoniaHeadwaters" "0.0146392608350587"
## [1] "InambariW"          "Tapajos"            "0.0257227613797469"
## [1] "InambariW"          "TapajosHeadwaters"  "0.0227791620544152"
## [1] "InambariW"          "XinguC"             "0.0593676862802229"
## [1] "InambariW"          "XinguHeadwaters"    "0.0252786723129295"
## [1] "InambariW"          "XinguN"             "0.0240337864870077"
## [1] "Rondonia"            "RondoniaHeadwaters"  "0.00188267804386135"
## [1] "Rondonia"           "Tapajos"            "0.0101819924904845"
## [1] "Rondonia"            "TapajosHeadwaters"   "0.00565266475356613"
## [1] "Rondonia"           "XinguC"             "0.0446627245772016"
## [1] "Rondonia"           "XinguHeadwaters"    "0.0105315966740904"
## [1] "Rondonia"            "XinguN"              "0.00616455118175711"
## [1] "RondoniaHeadwaters"  "Tapajos"             "0.00645385120586868"
## [1] "RondoniaHeadwaters"   "TapajosHeadwaters"    "-0.00852823749926178"
## [1] "RondoniaHeadwaters" "XinguC"             "0.050280221610652" 
## [1] "RondoniaHeadwaters"  "XinguHeadwaters"     "0.00624815702252127"
## [1] "RondoniaHeadwaters" "XinguN"             "0.0146205622452009"
## [1] "Tapajos"             "TapajosHeadwaters"   "0.00906928110552536"
## [1] "Tapajos"            "XinguC"             "0.0299008814937627"
## [1] "Tapajos"            "XinguHeadwaters"    "0.0165672821681767"
## [1] "Tapajos"              "XinguN"               "-0.00609465457250011"
## [1] "TapajosHeadwaters"  "XinguC"             "0.0648891106455655"
## [1] "TapajosHeadwaters"    "XinguHeadwaters"      "-0.00742813846070223"
## [1] "TapajosHeadwaters"  "XinguN"             "0.0191464386706068"
## [1] "XinguC"             "XinguHeadwaters"    "0.0695408026189717"
## [1] "XinguC"             "XinguN"             "0.0260438463074593"
## [1] "XinguHeadwaters"   "XinguN"            "0.037825840897622"
#repeat for chrZ from data not filtered for MAF
for(species1 in populations){
  for(species2 in populations){
    temp<- pixy_df %>% dplyr::filter(pop1==species1) %>% dplyr::filter(pop2==species2) %>% dplyr::filter(chromosome %in% "chr_Z") %>% dplyr::filter(statistic=="avg_wc_fst")
    fst<- mean(temp$value, na.rm=T)
    if(!is.na(fst)){
      print(c(species1, species2, fst))
    }
  }
}
## [1] "AtlanticForest_Ibateguara" "AtlanticForest_Ilheus"    
## [3] "0.0767513004872105"       
## [1] "AtlanticForest_Ibateguara" "InambariE"                
## [3] "0.178504891020353"        
## [1] "AtlanticForest_Ibateguara" "InambariW"                
## [3] "0.150853130057025"        
## [1] "AtlanticForest_Ibateguara" "Rondonia"                 
## [3] "0.132821268321934"        
## [1] "AtlanticForest_Ibateguara" "RondoniaHeadwaters"       
## [3] "0.242045743553954"        
## [1] "AtlanticForest_Ibateguara" "Tapajos"                  
## [3] "0.0855055485368801"       
## [1] "AtlanticForest_Ibateguara" "TapajosHeadwaters"        
## [3] "0.186663715221469"        
## [1] "AtlanticForest_Ibateguara" "XinguC"                   
## [3] "0.113710308879762"        
## [1] "AtlanticForest_Ibateguara" "XinguHeadwaters"          
## [3] "0.195379186117818"        
## [1] "AtlanticForest_Ibateguara" "XinguN"                   
## [3] "0.167014298908723"        
## [1] "AtlanticForest_Ilheus" "InambariE"             "0.179607835679633"    
## [1] "AtlanticForest_Ilheus" "InambariW"             "0.16041020178953"     
## [1] "AtlanticForest_Ilheus" "Rondonia"              "0.141764595822682"    
## [1] "AtlanticForest_Ilheus" "RondoniaHeadwaters"    "0.252922179082715"    
## [1] "AtlanticForest_Ilheus" "Tapajos"               "0.103676530768178"    
## [1] "AtlanticForest_Ilheus" "TapajosHeadwaters"     "0.21077392125703"     
## [1] "AtlanticForest_Ilheus" "XinguC"                "0.134742954869608"    
## [1] "AtlanticForest_Ilheus" "XinguHeadwaters"       "0.2070603658597"      
## [1] "AtlanticForest_Ilheus" "XinguN"                "0.167446844533322"    
## [1] "InambariE"          "Rondonia"           "0.0621887034164426"
## [1] "InambariE"          "RondoniaHeadwaters" "0.0389419414730781"
## [1] "InambariE"          "Tapajos"            "0.0703445113842338"
## [1] "InambariE"          "TapajosHeadwaters"  "0.0436987734102871"
## [1] "InambariE"         "XinguC"            "0.133499982926829"
## [1] "InambariE"          "XinguHeadwaters"    "0.0183280427363163"
## [1] "InambariE"          "XinguN"             "0.0570950268862994"
## [1] "InambariW"          "InambariE"          "0.0247831416159527"
## [1] "InambariW"         "Rondonia"          "0.044312342150272"
## [1] "InambariW"          "RondoniaHeadwaters" "0.0729058718766133"
## [1] "InambariW"          "Tapajos"            "0.0615178489541107"
## [1] "InambariW"          "TapajosHeadwaters"  "0.0504808588770001"
## [1] "InambariW"         "XinguC"            "0.122870024580867"
## [1] "InambariW"          "XinguHeadwaters"    "0.0408302125724016"
## [1] "InambariW"          "XinguN"             "0.0557499270328146"
## [1] "Rondonia"           "RondoniaHeadwaters" "0.0549071352561874"
## [1] "Rondonia"           "Tapajos"            "0.0327410174634165"
## [1] "Rondonia"           "TapajosHeadwaters"  "0.0150758713368325"
## [1] "Rondonia"          "XinguC"            "0.106317415476861"
## [1] "Rondonia"            "XinguHeadwaters"     "0.00702254776991831"
## [1] "Rondonia"           "XinguN"             "0.0478191693488804"
## [1] "RondoniaHeadwaters" "Tapajos"            "0.0302485960923439"
## [1] "RondoniaHeadwaters" "TapajosHeadwaters"  "0.0658773514680335"
## [1] "RondoniaHeadwaters" "XinguC"             "0.148234259505614" 
## [1] "RondoniaHeadwaters"  "XinguHeadwaters"     "0.00307652283951968"
## [1] "RondoniaHeadwaters" "XinguN"             "0.0691707390193758"
## [1] "Tapajos"             "TapajosHeadwaters"   "-0.0117290958147507"
## [1] "Tapajos"            "XinguC"             "0.0588008368100614"
## [1] "Tapajos"            "XinguHeadwaters"    "-0.020997487397788"
## [1] "Tapajos"             "XinguN"              "-0.0250231874053785"
## [1] "TapajosHeadwaters" "XinguC"            "0.105060430356262"
## [1] "TapajosHeadwaters"   "XinguHeadwaters"     "-0.0380571867509218"
## [1] "TapajosHeadwaters"  "XinguN"             "0.0342305509166642"
## [1] "XinguC"            "XinguHeadwaters"   "0.106101967496665"
## [1] "XinguC"             "XinguN"             "0.0574165213841296"
## [1] "XinguHeadwaters"    "XinguN"             "0.0179128276365021"
#repeat for filtered data with MAF>0.05. Compare to above; qualitatively similar. This is the dataset we will use in the publication.
for(species1 in populations){
  for(species2 in populations){
    temp<- pixy_fstdf %>% dplyr::filter(pop1==species1) %>% dplyr::filter(pop2==species2) %>% dplyr::filter(chromosome %in% chromosomes) %>% dplyr::filter(statistic=="avg_wc_fst")
    fst<- mean(temp$value, na.rm=T)
    if(!is.na(fst)){
      print(c(species1, species2, fst))
    }
  }
}
## [1] "AtlanticForest_Ibateguara" "AtlanticForest_Ilheus"    
## [3] "0.0569580330641249"       
## [1] "AtlanticForest_Ibateguara" "InambariE"                
## [3] "0.0884691695895341"       
## [1] "AtlanticForest_Ibateguara" "InambariW"                
## [3] "0.100184184676407"        
## [1] "AtlanticForest_Ibateguara" "Rondonia"                 
## [3] "0.0877331855875994"       
## [1] "AtlanticForest_Ibateguara" "RondoniaHeadwaters"       
## [3] "0.0624285518873619"       
## [1] "AtlanticForest_Ibateguara" "Tapajos"                  
## [3] "0.0712250864898072"       
## [1] "AtlanticForest_Ibateguara" "TapajosHeadwaters"        
## [3] "0.0538132145418404"       
## [1] "AtlanticForest_Ibateguara" "XinguC"                   
## [3] "0.0613455381169377"       
## [1] "AtlanticForest_Ibateguara" "XinguHeadwaters"          
## [3] "0.0915569741621504"       
## [1] "AtlanticForest_Ibateguara" "XinguN"                   
## [3] "0.0447365159719087"       
## [1] "AtlanticForest_Ilheus" "InambariE"             "0.0941409964564655"   
## [1] "AtlanticForest_Ilheus" "InambariW"             "0.10371651094003"     
## [1] "AtlanticForest_Ilheus" "Rondonia"              "0.0918227822125815"   
## [1] "AtlanticForest_Ilheus" "RondoniaHeadwaters"    "0.0732240393772699"   
## [1] "AtlanticForest_Ilheus" "Tapajos"               "0.0762082308494214"   
## [1] "AtlanticForest_Ilheus" "TapajosHeadwaters"     "0.0674526425447455"   
## [1] "AtlanticForest_Ilheus" "XinguC"                "0.0694105144150978"   
## [1] "AtlanticForest_Ilheus" "XinguHeadwaters"       "0.101022475438231"    
## [1] "AtlanticForest_Ilheus" "XinguN"                "0.0536494966978228"   
## [1] "InambariE"         "Rondonia"          "0.014569500029232"
## [1] "InambariE"            "RondoniaHeadwaters"   "-0.00570681738057048"
## [1] "InambariE"          "Tapajos"            "0.0242910621423988"
## [1] "InambariE"           "TapajosHeadwaters"   "-0.0211341179535811"
## [1] "InambariE"          "XinguC"             "0.0595191055382136"
## [1] "InambariE"         "XinguHeadwaters"   "0.017452294436947"
## [1] "InambariE"          "XinguN"             "0.0106977861051491"
## [1] "InambariW"          "InambariE"          "0.0117503465497728"
## [1] "InambariW"          "Rondonia"           "0.0163552180739896"
## [1] "InambariW"          "RondoniaHeadwaters" "0.0107274059455641"
## [1] "InambariW"          "Tapajos"            "0.0248555429134874"
## [1] "InambariW"            "TapajosHeadwaters"    "-0.00574220053444205"
## [1] "InambariW"          "XinguC"             "0.0706234091405303"
## [1] "InambariW"          "XinguHeadwaters"    "0.0309836418593907"
## [1] "InambariW"          "XinguN"             "0.0252302007868013"
## [1] "Rondonia"             "RondoniaHeadwaters"   "-0.00281871696916683"
## [1] "Rondonia"           "Tapajos"            "0.0108927628046649"
## [1] "Rondonia"           "TapajosHeadwaters"  "-0.028639717073863"
## [1] "Rondonia"         "XinguC"           "0.05478390349187"
## [1] "Rondonia"           "XinguHeadwaters"    "0.0145964476148253"
## [1] "Rondonia"            "XinguN"              "0.00523137834561145"
## [1] "RondoniaHeadwaters"   "Tapajos"              "-0.00627183540788083"
## [1] "RondoniaHeadwaters"  "TapajosHeadwaters"   "-0.0100239156399858"
## [1] "RondoniaHeadwaters" "XinguC"             "0.0397210659716431"
## [1] "RondoniaHeadwaters"  "XinguHeadwaters"     "0.00607737990245986"
## [1] "RondoniaHeadwaters" "XinguN"             "0.0125180318478436"
## [1] "Tapajos"             "TapajosHeadwaters"   "-0.0264999224252772"
## [1] "Tapajos"            "XinguC"             "0.0391750615350776"
## [1] "Tapajos"            "XinguHeadwaters"    "0.0177240811872076"
## [1] "Tapajos"            "XinguN"             "-0.013416040027068"
## [1] "TapajosHeadwaters"  "XinguC"             "0.0236256017032573"
## [1] "TapajosHeadwaters"   "XinguHeadwaters"     "-0.0168733105970821"
## [1] "TapajosHeadwaters"    "XinguN"               "-0.00388037590405464"
## [1] "XinguC"             "XinguHeadwaters"    "0.0605765409089081"
## [1] "XinguC"             "XinguN"             "0.0166087573830603"
## [1] "XinguHeadwaters"    "XinguN"             "0.0244144314469998"
#repeat for chrZ with MAF>0.05. Compare to above; qualitatively similar. This is the dataset we will use in the publication.
for(species1 in  populations){
  for(species2 in  populations){
    temp<- pixy_fstdf %>% dplyr::filter(pop1==species1) %>% dplyr::filter(pop2==species2) %>% dplyr::filter(chromosome %in% "chr_Z") %>% dplyr::filter(statistic=="avg_wc_fst")
    fst<- mean(temp$value, na.rm=T)
    if(!is.na(fst)){
      print(c(species1, species2, fst))
    }
  }
}
## [1] "AtlanticForest_Ibateguara" "AtlanticForest_Ilheus"    
## [3] "0.0513322375950294"       
## [1] "AtlanticForest_Ibateguara" "InambariE"                
## [3] "0.187087113321001"        
## [1] "AtlanticForest_Ibateguara" "InambariW"                
## [3] "0.198934559616725"        
## [1] "AtlanticForest_Ibateguara" "Rondonia"                 
## [3] "0.170711739481924"        
## [1] "AtlanticForest_Ibateguara" "RondoniaHeadwaters"       
## [3] "0.227442463511849"        
## [1] "AtlanticForest_Ibateguara" "Tapajos"                  
## [3] "0.110409380474036"        
## [1] "AtlanticForest_Ibateguara" "TapajosHeadwaters"        
## [3] "0.163869808948047"        
## [1] "AtlanticForest_Ibateguara" "XinguC"                   
## [3] "0.134149427147073"        
## [1] "AtlanticForest_Ibateguara" "XinguHeadwaters"          
## [3] "0.222978963375677"        
## [1] "AtlanticForest_Ibateguara" "XinguN"                   
## [3] "0.102921602412671"        
## [1] "AtlanticForest_Ilheus" "InambariE"             "0.197329310773962"    
## [1] "AtlanticForest_Ilheus" "InambariW"             "0.196211344562117"    
## [1] "AtlanticForest_Ilheus" "Rondonia"              "0.162987048271417"    
## [1] "AtlanticForest_Ilheus" "RondoniaHeadwaters"    "0.24727387424747"     
## [1] "AtlanticForest_Ilheus" "Tapajos"               "0.115052114954156"    
## [1] "AtlanticForest_Ilheus" "TapajosHeadwaters"     "0.180782020560886"    
## [1] "AtlanticForest_Ilheus" "XinguC"                "0.150145301119965"    
## [1] "AtlanticForest_Ilheus" "XinguHeadwaters"       "0.215133592167579"    
## [1] "AtlanticForest_Ilheus" "XinguN"                "0.119194142359796"    
## [1] "InambariE"         "Rondonia"          "0.079078563208539"
## [1] "InambariE"          "RondoniaHeadwaters" "0.016815148506993" 
## [1] "InambariE"          "Tapajos"            "0.0784239016835723"
## [1] "InambariE"           "TapajosHeadwaters"   "-0.0173859601312689"
## [1] "InambariE"         "XinguC"            "0.173691282597575"
## [1] "InambariE"          "XinguHeadwaters"    "0.0714427962054077"
## [1] "InambariE"          "XinguN"             "0.0485348643941658"
## [1] "InambariW"          "InambariE"          "0.0431811348254715"
## [1] "InambariW"          "Rondonia"           "0.0505195623072376"
## [1] "InambariW"          "RondoniaHeadwaters" "0.110508060710156" 
## [1] "InambariW"         "Tapajos"           "0.082694078216632"
## [1] "InambariW"          "TapajosHeadwaters"  "0.0513280545971937"
## [1] "InambariW"         "XinguC"            "0.168863934610787"
## [1] "InambariW"        "XinguHeadwaters"  "0.12065308412402"
## [1] "InambariW"          "XinguN"             "0.0707112416850594"
## [1] "Rondonia"           "RondoniaHeadwaters" "0.058457270084037" 
## [1] "Rondonia"           "Tapajos"            "0.0429490700462292"
## [1] "Rondonia"           "TapajosHeadwaters"  "-0.045898818143384"
## [1] "Rondonia"          "XinguC"            "0.140871650944715"
## [1] "Rondonia"           "XinguHeadwaters"    "0.0431002059192506"
## [1] "Rondonia"           "XinguN"             "0.0117572710723375"
## [1] "RondoniaHeadwaters" "Tapajos"            "0.0425834631870011"
## [1] "RondoniaHeadwaters"  "TapajosHeadwaters"   "-0.0457933087020198"
## [1] "RondoniaHeadwaters" "XinguC"             "0.132877606846022" 
## [1] "RondoniaHeadwaters" "XinguHeadwaters"    "0.0023313831767947"
## [1] "RondoniaHeadwaters" "XinguN"             "0.0242400126864022"
## [1] "Tapajos"            "TapajosHeadwaters"  "-0.132764846751582"
## [1] "Tapajos"            "XinguC"             "0.0671911894839042"
## [1] "Tapajos"              "XinguHeadwaters"      "-0.00679174933510075"
## [1] "Tapajos"            "XinguN"             "-0.073254963134998"
## [1] "TapajosHeadwaters"  "XinguC"             "0.0614020255229627"
## [1] "TapajosHeadwaters"   "XinguHeadwaters"     "0.00543390649183497"
## [1] "TapajosHeadwaters"   "XinguN"              "-0.0429610918705815"
## [1] "XinguC"            "XinguHeadwaters"   "0.136418471640001"
## [1] "XinguC"              "XinguN"              "-0.0253883046994288"
## [1] "XinguHeadwaters"     "XinguN"              "0.00327829178104021"

Note to self about filtering the dataset and interpreting genetic diversity statistics

Generally, you will need to filter your dataset to remove false negatives and false positives, or you will obtain biased estimates of pi and dxy. Too many false positive variants will inflate pi and dxy; to many false negatives will decrease it. To achieve an accurate estimate, false negatives and false positives should be balanced out, AND the amount of missing data should be proportionally the same for invariant and variant sites. If you filter SNPs more heavily than your filter invariants, you will decrease pi and dxy, but if you don’t filter enough then you will inflate it due to false positives! What to do? The task is to apply the same strength of filters to invariants as variants and ensure that the dataset is good quality. I have applied moderately stringent filters to both invariant and variant sites, avoiding filters that impact variants but not invariant sites (eg., genotype quality, Hardy Weinberg statistics). Instead, I focus on other statistics. For example, variant genotypes with low genotype quality may have low quality because they are low depth (making it hard to distinguish true variants from errors, or homozygotes from heterozygotes). Instead, we can filter for minimum depth, which affects variants and invariants the same way. Or, maybe a site has low genotype quality because it is a collapsed repeat. Instead, we can filter for maximum depth. These choices will not be perfect, but our goal is to strike a reasonable balance instead of avoiding all genotype errors at the cost of severe bias. One could easily bias genetic diversity estimates upwards or downwards by changing bioinformatic filters, so we should not take the exact numbers at their face value, and should be wary about comparing across different datasets.

Plot graph

Now I will plot a genome scan of Fst, pi, and dxy across the genome, for all population pairs. This will look a bit messy with so many populations plotted on top of each other, but might reveal some broad patterns.

#select which chromosomes to plot
# plotting summary statistics across all chromosomes:
#chromosomes<- c("chr_1", "chr_1A", "chr_2", "chr_3", "chr_4", "chr_4A", "chr_5", "chr_6", "chr_7", "chr_8", "chr_9", "chr_10", "chr_11", "chr_12", "chr_13", "chr_14", "chr_15", "chr_17", "chr_18", "chr_19", "chr_20", "chr_21", "chr_22", "chr_23", "chr_24", "chr_25", "chr_26", "chr_27", "chr_28", "chr_29") #Autosomes
chromosomes<- c("chr_1", "chr_1A", "chr_2", "chr_3", "chr_4", "chr_4A", "chr_5", "chr_6", "chr_7", "chr_8", "chr_9", "chr_10", "chr_11", "chr_12", "chr_13", "chr_14", "chr_15", "chr_17", "chr_18", "chr_19", "chr_20", "chr_21", "chr_22", "chr_23", "chr_24", "chr_25", "chr_26", "chr_27", "chr_28", "chr_29", "chr_Z") #
#chromosomes<- c("chr_Z") #chr Z

#select which taxa to plot, if you only want a subset plotted. Otherwise, just list all taxa here.
species2 <- c("AtlanticForest_Ibateguara", "AtlanticForest_Ilheus", "InambariE", "InambariW", "Rondonia", "RondoniaHeadwaters", "Tapajos", "TapajosHeadwaters", "XinguC", "XinguHeadwaters", "XinguN", "pi")
species1 <- c("AtlanticForest_Ibateguara", "AtlanticForest_Ilheus", "InambariE", "InambariW", "Rondonia", "RondoniaHeadwaters", "Tapajos", "TapajosHeadwaters", "XinguC", "XinguHeadwaters", "XinguN")

#"Error: Faceting variables must have at least one value" probably means you might have to switch which taxon is listed as species1 vs species2 

#select which variables to plot
statistics <- c("avg_pi", "avg_dxy", "avg_wc_fst")
#statistics <- "avg_wc_fst"

#plot the data
pixy_df %>%
  dplyr::filter(chromosome %in% chromosomes) %>%
  mutate(pop2 = replace_na(pop2, "pi")) %>%
  dplyr::filter(pop1 %in% species1) %>%  
  dplyr::filter(pop2 %in% species2) %>%  
  #mutate(chrom_color_group = case_when(as.numeric(chromosome) %% 2 != 0 ~ "even", chromosome == "X" ~ "even", TRUE ~ "odd" )) %>%
  mutate(chromosome = factor(chromosome, levels = chromosomes)) %>%
  filter(statistic %in% statistics) %>%
  ggplot(aes(x = (window_pos_1 + window_pos_2)/2, y = value, color = interaction(pop2, pop1)))+
  geom_point(size = 2, alpha = 1, stroke = 0)+
  #geom_line()+
  #geom_smooth(method=loess, span=1)+
  #facet_grid(statistic ~ chromosome, scales = "free_y", switch = "x", space = "free_x",
  facet_grid(statistic ~ chromosome, scales = "free", switch = "x", space="free_x", drop=T, shrink=T, #changed it so that chroms are proportional to their lengths
             labeller = labeller(statistic = pixy_labeller,
                                 value = label_value))+
  xlab("Chromosome")+
  ylab("Statistic Value")+
  #scale_color_manual(values = c("grey50", "black"))+
  theme_classic()+
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), panel.spacing = unit(0.1, "cm"), strip.background = element_blank(), strip.placement = "outside", axis.title.y = element_blank())+
  theme(text=element_text(family="Times New Roman"), axis.title.x = element_text(size = 16), legend.text = element_text(size = 16)) +
  theme(legend.position = "none")+ #I prefer to draw my own legend by hand
  scale_colour_manual(values = c("blue", "purple", "red", "#00b300", "#ff781f", "#FFD700", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black")) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0,NA))
## Warning: Removed 45348 rows containing missing values (`geom_point()`).

#remove headwaters and repeat
species2 <- c("AtlanticForest_Ibateguara", "AtlanticForest_Ilheus", "AtlanticForest", "Inambari", "InambariE", "InambariW", "Rondonia", "Tapajos", "XinguC", "XinguN", "XinguBelem", "pi")
species1 <- c("AtlanticForest_Ibateguara", "AtlanticForest_Ilheus", "AtlanticForest", "Inambari", "InambariE", "InambariW", "Rondonia", "Tapajos", "XinguC", "XinguN", "XinguBelem")

pixy_df %>%
  dplyr::filter(chromosome %in% chromosomes) %>%
  mutate(pop2 = replace_na(pop2, "pi")) %>%
  dplyr::filter(pop1 %in% species1) %>%  
  dplyr::filter(pop2 %in% species2) %>%  
  #mutate(chrom_color_group = case_when(as.numeric(chromosome) %% 2 != 0 ~ "even", chromosome == "X" ~ "even", TRUE ~ "odd" )) %>%
  mutate(chromosome = factor(chromosome, levels = chromosomes)) %>%
  filter(statistic %in% statistics) %>%
  ggplot(aes(x = (window_pos_1 + window_pos_2)/2, y = value, color = interaction(pop2, pop1)))+
  geom_point(size = 2, alpha = 1, stroke = 0)+
  #geom_line()+
  #geom_smooth(method=loess, span=1)+
  #facet_grid(statistic ~ chromosome, scales = "free_y", switch = "x", space = "free_x",
  facet_grid(statistic ~ chromosome, scales = "free", switch = "x", space="free_x", drop=T, shrink=T, #changed it so that chroms are proportional to their lengths
             labeller = labeller(statistic = pixy_labeller,
                                 value = label_value))+
  xlab("Chromosome")+
  ylab("Statistic Value")+
  #scale_color_manual(values = c("grey50", "black"))+
  theme_classic()+
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), panel.spacing = unit(0.1, "cm"), strip.background = element_blank(), strip.placement = "outside", axis.title.y = element_blank())+
  theme(text=element_text(family="Times New Roman"), axis.title.x = element_text(size = 16), legend.text = element_text(size = 16)) +
  theme(legend.position = "none")+ #I prefer to draw my own legend by hand
  scale_colour_manual(values = c("black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "black", "#00b300", "#ff781f", "#FFD700", "blue", "purple", "red" )) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0,NA))
## Warning: Removed 19981 rows containing missing values (`geom_point()`).

#headwaters only. Note that Fst estimates will be particularly messy for these populations because they have very low sample size, precluding precise estimates of Fst
species2 <- c("RondoniaHeadwaters", "TapajosHeadwaters", "XinguHeadwaters", "pi")
species1 <- c("RondoniaHeadwaters", "TapajosHeadwaters", "XinguHeadwaters")
pixy_df %>%
  dplyr::filter(chromosome %in% chromosomes) %>%
  mutate(pop2 = replace_na(pop2, "pi")) %>%
  dplyr::filter(pop1 %in% species1) %>%  
  dplyr::filter(pop2 %in% species2) %>%  
  #mutate(chrom_color_group = case_when(as.numeric(chromosome) %% 2 != 0 ~ "even", chromosome == "X" ~ "even", TRUE ~ "odd" )) %>%
  mutate(chromosome = factor(chromosome, levels = chromosomes)) %>%
  filter(statistic %in% statistics) %>%
  ggplot(aes(x = (window_pos_1 + window_pos_2)/2, y = value, color = interaction(pop2, pop1)))+
  geom_point(size = 2, alpha = 1, stroke = 0)+
  #geom_line()+
  #geom_smooth(method=loess, span=1)+
  #facet_grid(statistic ~ chromosome, scales = "free_y", switch = "x", space = "free_x",
  facet_grid(statistic ~ chromosome, scales = "free", switch = "x", space="free_x", drop=T, shrink=T, #changed it so that chroms are proportional to their lengths
             labeller = labeller(statistic = pixy_labeller,
                                 value = label_value))+
  xlab("Chromosome")+
  ylab("Statistic Value")+
  #scale_color_manual(values = c("grey50", "black"))+
  theme_classic()+
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), panel.spacing = unit(0.1, "cm"), strip.background = element_blank(), strip.placement = "outside", axis.title.y = element_blank())+
  theme(text=element_text(family="Times New Roman"), axis.title.x = element_text(size = 16), legend.text = element_text(size = 16)) +
  theme(legend.position = "none")+ #I prefer to draw my own legend by hand
  scale_colour_manual(values = c("#00b300", "#ff781f", "#FFD700", "blue", "purple", "red" )) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0,NA))
## Warning: Removed 3445 rows containing missing values (`geom_point()`).

Now, we will make a nicer plot:

chromosomes<- c("chr_1", "chr_1A", "chr_2", "chr_3", "chr_4", "chr_4A", "chr_5", "chr_6", "chr_7", "chr_8", "chr_9", "chr_10", "chr_11", "chr_12", "chr_13", "chr_14", "chr_15", "chr_17", "chr_18", "chr_19", "chr_20", "chr_21", "chr_22", "chr_23", "chr_24", "chr_25", "chr_26", "chr_27", "chr_28", "chr_29", "chr_Z") #All chromosomes except W (well, and 16, and the GRC)

#let's just look at three populations so that the figure is a bit cleaner and easier to read
species2 <- c("AtlanticForest_Ibateguara", "XinguN", "InambariW", "pi")
species1 <- c("AtlanticForest_Ibateguara", "XinguN", "InambariW")

#species2 <- c("AtlanticForest", "XinguBelem", "Inambari", "pi")
#species1 <- c("AtlanticForest", "XinguBelem", "Inambari")


#plot Fst with MAF>0.05
statistics <- "avg_wc_fst"
pixy_fstdf %>%
  dplyr::filter(chromosome %in% chromosomes) %>%
  mutate(pop2 = replace_na(pop2, "pi")) %>%
  dplyr::filter(pop1 %in% species1) %>%  
  dplyr::filter(pop2 %in% species2) %>%  
  #mutate(chrom_color_group = case_when(as.numeric(chromosome) %% 2 != 0 ~ "even", chromosome == "X" ~ "even", TRUE ~ "odd" )) %>%
  mutate(chromosome = factor(chromosome, levels = chromosomes)) %>%
  filter(statistic %in% statistics) %>%
  ggplot(aes(x = (window_pos_1 + window_pos_2)/2, y = value, color = interaction(pop2, pop1)))+
  geom_point(size = 2, alpha = 1, stroke = 0)+
  #geom_line()+
  #geom_smooth(method=loess, span=1)+
  #facet_grid(statistic ~ chromosome, scales = "free_y", switch = "x", space = "free_x",
  facet_grid(statistic ~ chromosome, scales = "free", switch = "x", space="free_x", drop=T, shrink=T, #changed it so that chroms are proportional to their lengths
             labeller = labeller(statistic = pixy_labeller,
                                 value = label_value))+
   #scale_color_manual(values = c("grey50", "black"))+
  theme_classic()+
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.text.y = element_blank(), panel.spacing = unit(0.2, "cm"), strip.background = element_blank(), strip.placement = "outside", axis.title.y = element_blank(), axis.title.x=element_blank(), strip.text.x = element_blank(), strip.text.y = element_blank(), axis.line = element_line(colour = 'black', size = 1))+
  #theme(text=element_text(family="Times New Roman"), axis.title.x = element_text(size = 16), legend.text = element_text(size = 16)) +
  theme(legend.position = "none", panel.background = element_rect(fill='transparent'),plot.background = element_rect(fill='transparent', color=NA))+ #I prefer to draw my own legend by hand
  scale_colour_manual(values = c("#a9cc54", "#000000", "#bd271c")) +
  scale_x_continuous(expand = c(0, 0), limits=c(-1000000,NA)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0,1))
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 2220 rows containing missing values (`geom_point()`).

#ggsave("fst_maf5_window500k_rubrocapilla.png", bg='transparent', width = 60, height = 5, units = "cm")

#plot dxy only
statistics <- "avg_dxy"
pixy_df %>%
  dplyr::filter(chromosome %in% chromosomes) %>%
  mutate(pop2 = replace_na(pop2, "pi")) %>%
  dplyr::filter(pop1 %in% species1) %>%  
  dplyr::filter(pop2 %in% species2) %>%  
  #mutate(chrom_color_group = case_when(as.numeric(chromosome) %% 2 != 0 ~ "even", chromosome == "X" ~ "even", TRUE ~ "odd" )) %>%
  mutate(chromosome = factor(chromosome, levels = chromosomes)) %>%
  filter(statistic %in% statistics) %>%
  ggplot(aes(x = (window_pos_1 + window_pos_2)/2, y = value, color = interaction(pop2, pop1)))+
  geom_point(size = 2, alpha = 1, stroke = 0)+
  #geom_line()+
  #geom_smooth(method=loess, span=1)+
  #facet_grid(statistic ~ chromosome, scales = "free_y", switch = "x", space = "free_x",
  facet_grid(statistic ~ chromosome, scales = "free", switch = "x", space="free_x", drop=T, shrink=T, #changed it so that chroms are proportional to their lengths
             labeller = labeller(statistic = pixy_labeller,
                                 value = label_value))+
   #scale_color_manual(values = c("grey50", "black"))+
  theme_classic()+
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.text.y = element_blank(), panel.spacing = unit(0.2, "cm"), strip.background = element_blank(), strip.placement = "outside", axis.title.y = element_blank(), axis.title.x=element_blank(), strip.text.x = element_blank(), strip.text.y = element_blank(), axis.line = element_line(colour = 'black', size = 1))+
  #theme(text=element_text(family="Times New Roman"), axis.title.x = element_text(size = 16), legend.text = element_text(size = 16)) +
  theme(legend.position = "none", panel.background = element_rect(fill='transparent'),plot.background = element_rect(fill='transparent', color=NA))+ #I prefer to draw my own legend by hand
  scale_colour_manual(values = c("#a9cc54", "#000000", "#bd271c")) +
  scale_x_continuous(expand = c(0, 0), limits=c(-1000000,NA)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0,NA))
## Warning: Removed 48 rows containing missing values (`geom_point()`).

#ggsave("dxy_window500k_rubrocapilla.png", bg='transparent', width = 60, height = 5, units = "cm")

#plot pi only
statistics <- "avg_pi"
pixy_df %>%
  dplyr::filter(chromosome %in% chromosomes) %>%
  mutate(pop2 = replace_na(pop2, "pi")) %>%
  dplyr::filter(pop1 %in% species1) %>%  
  dplyr::filter(pop2 %in% species2) %>%  
  #mutate(chrom_color_group = case_when(as.numeric(chromosome) %% 2 != 0 ~ "even", chromosome == "X" ~ "even", TRUE ~ "odd" )) %>%
  mutate(chromosome = factor(chromosome, levels = chromosomes)) %>%
  filter(statistic %in% statistics) %>%
  ggplot(aes(x = (window_pos_1 + window_pos_2)/2, y = value, color = interaction(pop2, pop1)))+
  geom_point(size = 2, alpha = 1, stroke = 0)+
  #geom_line()+
  #geom_smooth(method=loess, span=1)+
  #facet_grid(statistic ~ chromosome, scales = "free_y", switch = "x", space = "free_x",
  facet_grid(statistic ~ chromosome, scales = "free", switch = "x", space="free_x", drop=T, shrink=T, #changed it so that chroms are proportional to their lengths
             labeller = labeller(statistic = pixy_labeller,
                                 value = label_value))+
   #scale_color_manual(values = c("grey50", "black"))+
  theme_classic()+
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.text.y = element_blank(), panel.spacing = unit(0.2, "cm"), strip.background = element_blank(), strip.placement = "outside", axis.title.y = element_blank(), axis.title.x=element_blank(), strip.text.x = element_blank(), strip.text.y = element_blank(), axis.line = element_line(colour = 'black', size = 1))+
  #theme(text=element_text(family="Times New Roman"), axis.title.x = element_text(size = 16), legend.text = element_text(size = 16)) +
  theme(legend.position = "none", panel.background = element_rect(fill='transparent'),plot.background = element_rect(fill='transparent', color=NA))+ #I prefer to draw my own legend by hand
  scale_colour_manual(values = c("#D55E00", "#CC79A7", "#FFD700")) +
  scale_x_continuous(expand = c(0, 0), limits=c(-1000000,NA)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0,NA))
## Warning: Removed 48 rows containing missing values (`geom_point()`).

#ggsave("pi_window500k_rubrocapilla.png", bg='transparent', width = 60, height = 5, units = "cm")

#plot fst, dxy, and pi together
statistics <- c("avg_pi", "avg_dxy", "avg_wc_fst")
pixy_df %>%
  dplyr::filter(chromosome %in% chromosomes) %>%
  mutate(pop2 = replace_na(pop2, "pi")) %>%
  dplyr::filter(pop1 %in% species1) %>%  
  dplyr::filter(pop2 %in% species2) %>%  
  #mutate(chrom_color_group = case_when(as.numeric(chromosome) %% 2 != 0 ~ "even", chromosome == "X" ~ "even", TRUE ~ "odd" )) %>%
  mutate(chromosome = factor(chromosome, levels = chromosomes)) %>%
  filter(statistic %in% statistics) %>%
  ggplot(aes(x = (window_pos_1 + window_pos_2)/2, y = value, color = interaction(pop2, pop1)))+
  geom_point(size = 2, alpha = 1, stroke = 0)+
  #geom_line()+
  #geom_smooth(method=loess, span=1)+
  #facet_grid(statistic ~ chromosome, scales = "free_y", switch = "x", space = "free_x",
  facet_grid(statistic ~ chromosome, scales = "free", switch = "x", space="free_x", drop=T, shrink=T, #changed it so that chroms are proportional to their lengths
             labeller = labeller(statistic = pixy_labeller,
                                 value = label_value))+
  xlab("Chromosome")+
  ylab("Statistic Value")+
  #scale_color_manual(values = c("grey50", "black"))+
  theme_classic()+
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), panel.spacing = unit(0.2, "cm"), strip.background = element_blank(), strip.placement = "outside", axis.title.y = element_blank())+
  theme(text=element_text(family="Times New Roman"), axis.title.x = element_text(size = 16), legend.text = element_text(size = 16)) +
  theme(legend.position = "none")+ #I prefer to draw my own legend by hand
  #scale_colour_manual(values = c("#CC79A7", "#D55E00", "#FFD700", "#0072B2", "#56B4E9", "#009E73", "#E69F00")) + #palette from Okabe & Ito
  #scale_colour_manual(values = c("#D81B60", "#D55E00", "#1E88E5", "#CC79A7", "#004D40", "#FFD700", "#E69F00")) + #palette from Okabe & Ito
  scale_colour_manual(values = c("#000000", "#D55E00", "#a9cc54", "#CC79A7", "#bd271c", "#FFD700", "#ee3225", "#819b42")) + #using Ceratopipra themed colours for Fst/Dxy to not conflict with the population colour codes
  scale_colour_manual(values = c("#a9cc54", "#D55E00", "#000000", "#CC79A7", "#bd271c", "#FFD700", "#ee3225", "#819b42")) + #using Ceratopipra themed colours for Fst/Dxy to not conflict with the population colour codes
  #scale_colour_manual(values = c("#E69F00", "#D55E00", "#0072B2", "#56B4E9", "#009E73", "#FFD700", "#CC79A7")) +
  scale_x_continuous(expand = c(0, 0), limits=c(-1000000,NA)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0,NA))
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Warning: Removed 2104 rows containing missing values (`geom_point()`).

#ggsave("fstdxypi_window500k_rubrocapilla.png", bg='transparent', width = 60, height = 15, units = "cm")

Histograms

Now let’s look at the distribution of pi and dxy.

#I want to look at the data without the Headwaters samples
species2 <- c("AtlanticForest_Ibateguara", "AtlanticForest_Ilheus", "InambariE", "InambariW", "Rondonia", "Tapajos", "XinguC", "XinguN", "pi")
species1 <- c("AtlanticForest_Ibateguara", "AtlanticForest_Ilheus", "InambariE", "InambariW", "Rondonia", "Tapajos", "XinguC", "XinguN")

#I want to look at the data without the Headwaters samples
temp <- pixy_fstdf %>%
  dplyr::filter(pop1 %in% species1) %>%  
  dplyr::filter(pop2 %in% species2)  

#choose what you want to plot
statistic_choice=c("avg_dxy", "avg_pi")
#select which chromosomes to include (I want to exclude sex chromosomes)
chromosomes<- c("chr_1", "chr_1A", "chr_2", "chr_3", "chr_4", "chr_4A", "chr_5", "chr_6", "chr_7", "chr_8", "chr_9", "chr_10", "chr_11", "chr_12", "chr_13", "chr_14", "chr_15", "chr_17", "chr_18", "chr_19", "chr_20", "chr_21", "chr_22", "chr_23", "chr_24", "chr_25", "chr_26", "chr_27", "chr_28", "chr_29") #Autosomes

#plot histograms
#species1 <- c("AtlanticForest", "XinguBelem", "Tapajos", "Rondonia", "Inambari")
statistic_choice <- "avg_pi"
pixy_df %>%
  dplyr::filter(chromosome %in% chromosomes) %>%
  dplyr::filter(statistic %in% statistic_choice) %>%
  dplyr::filter(pop1 %in% species1) %>%  
  #dplyr::filter(pop2 %in% species2) %>%  
  mutate(pop2 = replace_na(pop2, "pi")) %>%
  mutate(chromosome = factor(chromosome, levels = chromosomes)) %>%
  ggplot(aes(x = value, color = interaction(pop2, pop1), fill = interaction(pop2, pop1)))+
  geom_density(alpha=0.1)+
  theme_classic()+
  #scale_colour_manual(values = c("blue", "purple", "red", "#00b300", "#ff781f", "#FFD700"))+
  #scale_fill_manual(values = c("blue", "purple", "red", "#00b300", "#ff781f", "#FFD700"))+
  xlim(c(0,0.025))
## Warning: Removed 85 rows containing non-finite values (`stat_density()`).

statistic_choice <- "avg_dxy"
pixy_df %>%
  dplyr::filter(chromosome %in% chromosomes) %>%
  dplyr::filter(statistic %in% statistic_choice) %>%
  dplyr::filter(pop1 %in% species1) %>%  
  dplyr::filter(pop2 %in% species2) %>%  
  mutate(pop2 = replace_na(pop2, "pi")) %>%
  mutate(chromosome = factor(chromosome, levels = chromosomes)) %>%
  ggplot(aes(x = value, color = interaction(pop2, pop1), fill = interaction(pop2, pop1)))+
  geom_density(alpha=0.1)+
  theme_classic()+
  #scale_colour_manual(values = c("blue", "purple", "red", "#00b300", "#ff781f", "#FFD700"))+
  #scale_fill_manual(values = c("blue", "purple", "red", "#00b300", "#ff781f", "#FFD700"))+
  scale_x_continuous(expand = c(0, 0), limits = c(0,0.015))
## Warning: Removed 628 rows containing non-finite values (`stat_density()`).

pixy_df2 %>%
  dplyr::filter(no_sites.x>100) %>%
  ggplot()+
  geom_density(aes(x = AtlanticForest_Ibateguara), color="#00b300", alpha=0.1)+
  geom_density(aes(x = XinguC), color="#ff781f", alpha=0.1)+
  geom_density(aes(x = AtlanticForest_Ibateguara_XinguC), color="#FFD700", alpha=0.1)+
  geom_density(aes(x = InambariW), color="#47683b", alpha=0.1)+
  geom_density(aes(x = Rondonia), color="dodgerblue", alpha=0.1)+
  geom_density(aes(x = AtlanticForest_Ibateguara_InambariW), color="black", alpha=0.1)+
  theme_classic()+
  scale_x_continuous(expand = c(0, 0), limits = c(0,0.02))
## Warning: Removed 41 rows containing non-finite values (`stat_density()`).
## Warning: Removed 41 rows containing non-finite values (`stat_density()`).
## Warning: Removed 43 rows containing non-finite values (`stat_density()`).
## Warning: Removed 41 rows containing non-finite values (`stat_density()`).
## Removed 41 rows containing non-finite values (`stat_density()`).
## Warning: Removed 43 rows containing non-finite values (`stat_density()`).

#how many sites are in each window
pixy_df2 %>%
  #dplyr::filter(no_sites.x>90) %>%
  ggplot()+
  #geom_density(aes(x = no_sites.x), alpha=0.1)+
  geom_density(aes(x = no_snps), alpha=0.1)+
  theme_classic()
## Warning: Removed 1 rows containing non-finite values (`stat_density()`).

median(pixy_df2$no_snps, na.rm=T) # for 100k windows, 126 for 500k,  for 1Mb
## [1] 25
# old data: 21 for 100k windows, 107 for 500k, 225 for 1Mb
median(pixy_df2$no_sites.x, na.rm=T) # for 100k windows, 1582 for 500K,  for 1Mb
## [1] 567.5
#old data: 236 for 100k windows, 1339.5 for 500K, 2811 for 1Mb

Look how tightly dxy and pi correspond.

#choose the max value for the axes, to ensure that all plots have the same axis scales
xlimit=0.015
ylimit=0.015

#plot pi vs pi
pixy_df2 %>%
  ggplot(aes(x = XinguC, y = AtlanticForest_Ibateguara))+
  geom_point(alpha=0.1)+
  coord_cartesian(xlim=c(0,xlimit), ylim=c(0,ylimit), expand=F)+
  geom_abline(slope=1)
## Warning: Removed 42 rows containing missing values (`geom_point()`).

#plot dxy vs pi 
pixy_df2 %>%
  ggplot(aes(x = AtlanticForest_Ibateguara_XinguC, y = AtlanticForest_Ibateguara))+
  geom_point(alpha=0.1)+
  coord_cartesian(xlim=c(0,xlimit), ylim=c(0,ylimit), expand=F)+
  geom_abline(slope=1)
## Warning: Removed 62 rows containing missing values (`geom_point()`).

pixy_df2 %>%
  ggplot(aes(x = AtlanticForest_Ibateguara_InambariW, y = AtlanticForest_Ibateguara))+
  geom_point(alpha=0.1)+
  coord_cartesian(xlim=c(0,xlimit), ylim=c(0,ylimit), expand=F)+
  geom_abline(slope=1)
## Warning: Removed 62 rows containing missing values (`geom_point()`).

pixy_df2 %>%
  ggplot(aes(x = AtlanticForest_Ibateguara_XinguC, y = AtlanticForest_Ibateguara_InambariW))+
  geom_point(alpha=0.1)+
  coord_cartesian(xlim=c(0,xlimit), ylim=c(0,ylimit), expand=F)+
  geom_abline(slope=1)
## Warning: Removed 44 rows containing missing values (`geom_point()`).

pixy_df2 %>%
  ggplot(aes(x = AtlanticForest_Ibateguara_XinguHeadwaters, y = Tapajos_XinguHeadwaters))+
  geom_point(alpha=0.1)+
  coord_cartesian(xlim=c(0,xlimit), ylim=c(0,ylimit), expand=F)+
  geom_abline(slope=1)
## Warning: Removed 44 rows containing missing values (`geom_point()`).

xlimit=1
ylimit=0.03

#plot fst vs dxy
pixy_df2 %>%
  ggplot(aes(x = Tapajos_Fst_XinguHeadwaters, y = AtlanticForest_Ibateguara_XinguHeadwaters))+
  geom_point(alpha=0.05)+
  coord_cartesian(xlim=c(0,xlimit), ylim=c(0,ylimit), expand=F)
## Warning: Removed 54 rows containing missing values (`geom_point()`).

pixy_df2 %>%
  ggplot(aes(x = AtlanticForest_Ibateguara_Fst_XinguC, y = AtlanticForest_Ibateguara_XinguC))+
  geom_point(alpha=0.05)+
  coord_cartesian(xlim=c(0,xlimit), ylim=c(0,ylimit), expand=F)
## Warning: Removed 67 rows containing missing values (`geom_point()`).

Plot pi per species

I would like to compare the sequence diversity within each lineage

#firstly, can decode to filter to look at only some chromosomes (eg, exclude sex chromosomes). I am selecting only the autosomes.
chromosomes<-c("chr_1", "chr_1A", "chr_2", "chr_3", "chr_4", "chr_4A", "chr_5", "chr_6", "chr_7", "chr_8", "chr_9", "chr_10", "chr_11", "chr_12", "chr_13", "chr_14", "chr_15", "chr_17", "chr_18", "chr_19", "chr_20", "chr_21", "chr_22", "chr_23", "chr_24", "chr_25", "chr_26", "chr_27", "chr_28", "chr_29")

#look at just the average per population
avg_pis<- pixy_df %>%
  dplyr::filter(chromosome %in% chromosomes) %>%
  dplyr::filter(is.na(pop2)) %>% 
  spread(key=statistic, value=value) %>%
  group_by(pop1) %>%
  summarise(pi=sum(count_diffs, na.rm=T)/sum(count_comparisons, na.rm=T), 
            stdev=stats::sd(avg_pi, na.rm=T))

#calculate boxplot around values
pi_df<- pixy_df %>%
  dplyr::filter(chromosome %in% chromosomes) %>%
  dplyr::filter(is.na(pop2)) %>% 
  spread(key=statistic, value=value)

#plot the pi
pi_df %>%
  filter(pop1 %in% c("AtlanticForest_Ibateguara", "AtlanticForest_Ilheus", "InambariE", "InambariW", "Rondonia", "RondoniaHeadwaters", "Tapajos", "TapajosHeadwaters", "XinguC", "XinguHeadwaters", "XinguN")) %>%
  ggplot(aes(x = pop1, y = pi, colour=pi))+
    geom_boxplot(aes(x = pop1, y = avg_pi), colour="black")+
    ylim(0,NA)+
  theme_classic()
## Warning: Removed 100 rows containing non-finite values (`stat_boxplot()`).

chrZ investigation

This investigation is to visualize Fst between males and females to identify the area around the pseudoautosomal region where chrW reads likely map in females.

my_y_title <- expression(paste(italic("F")[ST], "  ", "(Male vs. Female ", italic("Ceratopipra rubrocapilla"), ")"))

#plot fst between Males and Females
statistics <- c("avg_wc_fst")
pixy_chrZ_investigation_fstdf %>%
  mutate(pop2 = replace_na(pop2, "pi")) %>%
  filter(statistic %in% statistics) %>%
  ggplot(aes(x = (window_pos_1 + window_pos_2)/2, y = value, color = interaction(pop2, pop1)))+
  geom_point(size = 2, alpha = 0.5, stroke = 0)+
  #ylab("Fst (Male vs Female Ceratopipra rubrocapilla)")+
  labs(y=my_y_title, x="Position on Chromosome Z")+
  theme_classic()+
  theme(panel.spacing = unit(0.2, "cm"), strip.background = element_blank(), strip.placement = "outside")+
  theme(text=element_text(family="Times New Roman"), axis.title.x = element_text(size = 12), axis.title.y = element_text(size = 12), legend.text = element_text(size = 16)) +
  theme(legend.position = "none")+ #I prefer to draw my own legend by hand
  scale_colour_manual(values = c("#D55E00")) + #using Ceratopipra themed colours 
  scale_x_continuous(expand = c(0, 0), limits=c(NA,NA)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0,1.05))
## Warning: Removed 7097 rows containing missing values (`geom_point()`).

#ggsave("Fst_MalevsFemale_individualSNPs_Ceratopiprarubrocapilla.png", bg='transparent', width = 20, height = 10, units = "cm")

There is a large peak of high Fst near the 5’ end of the chromosome, which is where the pseudoautosomal region seems to be in some other manakins as well. We can exclude this region from downstream analyses. However, there are scattered peaks of high Fst in other areas as well, so we should still implement another filter: removing sites where females are called as heterozygous.